Differential expression using the bulk RNAseq data from the May release.
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(stringr)
library(MASS)
library(RColorBrewer)
library(DESeq2)
library(viridis)
library(ggpointdensity)
library(dplyr)
library(data.table)
library(ComplexHeatmap)
library(UpSetR)
library(readxl)
theme_set(theme_classic())
path = "~/research/data/MorPhiC/May-2024/JAX_RNAseq2_ExtraEmbryonic/"
dat_path = paste0(path, "processed/release110-gencode44/Tables")## [1] 82 32
## biomaterial_id lib_biomaterial_id differentiated_biomaterial_id
## 1 MOK20011-WT1 GT23-13719 PrS-MOK20011-WT1
## 2 MOK20011-WT2 GT23-13720 PrS-MOK20011-WT2
## differentiated_biomaterial_description differentiation_protocol
## 1 KOLF2.2 derived primitive syncytium JAXPD001
## 2 KOLF2.2 derived primitive syncytium JAXPD001
## differentiated_timepoint_value differentiated_timepoint_unit
## 1 6 days
## 2 6 days
## differentiated_terminally_differentiated
## 1 Yes
## 2 Yes
## model_organ ko_strategy ko_gene
## 1 extra-embryonic primitive syncytial cells WT WT
## 2 extra-embryonic primitive syncytial cells WT WT
## library_preparation_protocol dissociation_protocol fragment_size input_amount
## 1 JAXPL001 N.A 417 300
## 2 JAXPL001 N.A 402 300
## input_unit final_yield final_yield_unit lib_concentration
## 1 ngs 190.0 ngs 34.51784
## 2 ngs 91.2 ngs 17.18679
## lib_concentration_unit PCR_cycle PCR_cycle_sample_index
## 1 nM 10 NA
## 2 nM 10 NA
## file_id sequencing_protocol
## 1 RUNX1_WT1_GT23-13719_CCAAGTCT-TCATCCTT_S170_L003 JAXPS001
## 2 RUNX1_WT2_GT23-13720_TTGGACTC-CTGCTTCC_S157_L003 JAXPS001
## run_id biomaterial_description derived_cell_line_accession
## 1 20231124_23-robson-013 KOLF2.2 KOLF2.2J
## 2 20231124_23-robson-013 KOLF2.2 KOLF2.2J
## clone_id protocol_id zygosity type model_system
## 1 WT1 N.A N.A iPSC PrS
## 2 WT2 N.A N.A iPSC PrS
## [1] "biomaterial_id"
## [2] "lib_biomaterial_id"
## [3] "differentiated_biomaterial_id"
## [4] "differentiated_biomaterial_description"
## [5] "differentiation_protocol"
## [6] "differentiated_timepoint_value"
## [7] "differentiated_timepoint_unit"
## [8] "differentiated_terminally_differentiated"
## [9] "model_organ"
## [10] "ko_strategy"
## [11] "ko_gene"
## [12] "library_preparation_protocol"
## [13] "dissociation_protocol"
## [14] "fragment_size"
## [15] "input_amount"
## [16] "input_unit"
## [17] "final_yield"
## [18] "final_yield_unit"
## [19] "lib_concentration"
## [20] "lib_concentration_unit"
## [21] "PCR_cycle"
## [22] "PCR_cycle_sample_index"
## [23] "file_id"
## [24] "sequencing_protocol"
## [25] "run_id"
## [26] "biomaterial_description"
## [27] "derived_cell_line_accession"
## [28] "clone_id"
## [29] "protocol_id"
## [30] "zygosity"
## [31] "type"
## [32] "model_system"
##
## 1
## 82
##
## 1
## 82
##
## 1
## 82
## [1] 38592 83
## Name MEIS2_CE_A06_GT23-12174_TGCGAGAC-CAACAATG_S1_L001
## 1 ENSG00000268674 0
## 2 ENSG00000271254 793
## MEF2C_PTC_B03_GT23-13717_TCTCTACT-GAACCGCG_S177_L003
## 1 0
## 2 937
## RUNX1_KO_G07_GT23-13724_CGGCGTGA-ACAGGCGC_S175_L003
## 1 0
## 2 1079
stopifnot(setequal(names(cts)[-1], meta$file_id))
meta = meta[match(names(cts)[-1], meta$file_id),]
table(names(cts)[-1] == meta$file_id)##
## TRUE
## 82
model_s = meta$model_system
get_q75 <- function(x){quantile(x, probs = 0.75)}
gene_info = data.frame(Name = cts$Name,
t(apply(cts_dat, 1, tapply, model_s, min)),
t(apply(cts_dat, 1, tapply, model_s, median)),
t(apply(cts_dat, 1, tapply, model_s, get_q75)),
t(apply(cts_dat, 1, tapply, model_s, max)))
names(gene_info)[2:9] = paste0(rep(c("ExM_", "PrS_"), times=4),
rep(c("min", "median", "q75", "max"), each=2))
dim(gene_info)## [1] 38592 9
## Name ExM_min PrS_min ExM_median PrS_median ExM_q75
## ENSG00000268674 ENSG00000268674 0 0 0 0.0 0
## ENSG00000271254 ENSG00000271254 594 512 797 958.5 858
## PrS_q75 ExM_max PrS_max
## ENSG00000268674 0.00 0 1
## ENSG00000271254 1051.75 1079 1695
## ExM_min PrS_min ExM_median PrS_median
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 0.0 Median : 0.0 Median : 2.0 Median : 1.5
## Mean : 323.5 Mean : 335.2 Mean : 593.2 Mean : 647.1
## 3rd Qu.: 100.0 3rd Qu.: 71.0 3rd Qu.: 217.0 3rd Qu.: 177.5
## Max. :240570.0 Max. :209625.0 Max. :883830.0 Max. :382389.5
## ExM_q75 PrS_q75 ExM_max PrS_max
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 1.0 1st Qu.: 1
## Median : 3.8 Median : 3.0 Median : 8.0 Median : 8
## Mean : 731.5 Mean : 772.1 Mean : 1063.6 Mean : 1188
## 3rd Qu.: 279.0 3rd Qu.: 217.6 3rd Qu.: 400.2 3rd Qu.: 330
## Max. :1028917.5 Max. :454140.8 Max. :1675470.0 Max. :773629
##
## FALSE TRUE
## FALSE 12951 1098
## TRUE 1885 22658
## [1] 25641 82
gene_info = gene_info[w2kp,]
meta$read_depth = colSums(cts_dat)/1e6
meta$q75 = apply(cts_dat, 2, quantile, probs = 0.75)
ggplot(meta, aes(x=read_depth, y=q75, color=ko_strategy, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "KO type", shape = "Model") +
scale_color_brewer(palette = "Set1")meta$run_id_short = str_replace(meta$run_id, "^[^-]*-", "")
ggplot(meta, aes(x=read_depth, y=q75, color=run_id_short, shape = model_system)) +
geom_point(size=2, alpha=0.7) + ggtitle("Gene counts") +
scale_shape_manual(values = c(7, 16)) +
xlab("read depth (million)") + ylab("75 percentile of gene expression") +
labs(color = "Run ID", shape = "Model") +
scale_color_brewer(palette = "Set1")sample2kp = which(meta$ko_gene == "WT")
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
summary(meta_m$q75)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 428.0 510.0 554.0 574.9 649.0 727.0
## [1] 25641 21
stopifnot(all(meta_m$file_id == names(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)## [1] 21 24263
## [1] "sdev" "rotation" "center" "scale" "x"
## [1] 0.47564883 0.12518353 0.07086713 0.05336636 0.02717944
## [1] 47.6 12.5 7.1 5.3 2.7
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=model_system,
color=run_id_short)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
labs(color="KO gene", shape ="KO strategy") +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle("Wild type samples") +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)## Generate sample information matrix
cols2kp = c("model_system", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)## [1] 21 2
## model_system q75
## 59 PrS 554
## 25 PrS 428
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ model_system + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)## [1] 25641 6
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 944.5427577 0.1298778 0.3327253 0.3903454 0.6962812
## ENSG00000276345 0.3686712 -5.3837662 9.7887890 -0.5499931 0.5823241
## padj
## ENSG00000271254 0.8840768
## ENSG00000276345 NA
##
## FALSE TRUE
## 14702 10939
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Read depth")
print(g0)## association with model_system
dds_lrt = DESeq(dds, test="LRT", reduced = ~ + log_q75)
res_lrt = results(dds_lrt)
res_model_system = as.data.frame(res_lrt)
dim(res_model_system)## [1] 25641 6
## baseMean log2FoldChange lfcSE stat pvalue
## ENSG00000271254 944.5427577 0.1298778 0.3327253 31.92267820 1.604331e-08
## ENSG00000276345 0.3686712 -5.3837662 9.7887890 -0.01029894 1.000000e+00
## padj
## ENSG00000271254 4.7729e-08
## ENSG00000276345 1.0000e+00
##
## FALSE TRUE
## 24642 999
g0 = ggplot(res_model_system %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle("Model system")
print(g0)##
## ExM PrS
## 20230809_23-robson-007-run2 0 12
## 20231004_23-robson-011 22 0
## 20231124_23-robson-013 12 12
## 20240307_24-robson-002 0 12
## 20240307_24-robson-005 0 12
##
## BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
## 20230809_23-robson-007-run2 0 0 0 0 9 0 0 3
## 20231004_23-robson-011 0 0 9 7 0 0 0 6
## 20231124_23-robson-013 0 9 0 0 0 0 9 6
## 20240307_24-robson-002 9 0 0 0 0 0 0 3
## 20240307_24-robson-005 0 0 0 0 0 9 0 3
##
## BHLHE40 MEF2C MEIS1 MEIS2 MXD1 NCOA3 RUNX1 WT
## CE 3 3 3 3 3 3 3 0
## KO 3 3 3 1 3 3 3 0
## PTC 3 3 3 3 3 3 3 0
## WT 0 0 0 0 0 0 0 21
for(model1 in unique(meta$model_system)){
print(model1)
sample2kp = which(meta$model_system == model1)
cts_dat_m = cts_dat[,sample2kp]
meta_m = meta[sample2kp,]
table(meta_m$ko_strategy, substr(colnames(cts_dat_m), 1, 7))
stopifnot(all(meta_m$file_id == names(cts_dat_m)))
q75 = apply(cts_dat_m, 1, quantile, probs=0.75)
cts_dat_n = t(cts_dat_m[q75 > 0,])
cts_dat_n = log(median(meta_m$q75)*cts_dat_n/meta_m$q75 + 1)
dim(cts_dat_n)
sample_pca = prcomp(cts_dat_n, center = TRUE, scale. = TRUE)
names(sample_pca)
pc_scores = sample_pca$x
eigen_vals = sample_pca$sdev^2
eigen_vals[1:5]/sum(eigen_vals)
pvs = round(eigen_vals[1:5]/sum(eigen_vals)*100,1)
pvs
PC_df = data.frame(pc_scores)
PC_df = cbind(PC_df, meta_m)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=ko_strategy, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
gPC = ggplot(PC_df, aes(x = PC1, y = PC2, shape=run_id_short, color=ko_gene)) +
geom_point(size=2.5, alpha=0.7) + scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values = c(7, 15, 16, 17)) +
xlab(sprintf("PC1 (%.1f%% variance)", pvs[1])) +
ylab(sprintf("PC2 (%.1f%% variance)", pvs[2])) +
ggtitle(model1) +
guides(
color = guide_legend(title = NULL, order = 1),
shape = guide_legend(title = NULL, order = 2)
) +
theme(
legend.margin = margin(0, 0, 0, 0),
legend.box.just = "left",
legend.direction = "vertical"
)
print(gPC)
table(meta_m$run_id, meta_m$ko_gene)
## Generate sample information matrix
cols2kp = c("ko_strategy", "ko_gene", "run_id", "q75")
sample_info = meta_m[,cols2kp]
dim(sample_info)
sample_info[1:2,]
table(sample_info$ko_strategy, sample_info$ko_gene)
sample_info$ko_group = paste(sample_info$ko_gene,
sample_info$ko_strategy, sep="_")
table(sample_info$ko_group)
sample_info$log_q75 = log(sample_info$q75)
dds = DESeqDataSetFromMatrix(cts_dat_m, colData=sample_info,
~ ko_group + run_id + log_q75)
dds = DESeq(dds)
## association with read-depth
res_rd = results(dds, name = "log_q75")
res_rd = as.data.frame(res_rd)
dim(res_rd)
res_rd[1:2,]
table(is.na(res_rd$padj))
g0 = ggplot(res_rd %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(model1, " read depth"))
print(g0)
## association with run_id
dds_lrt = DESeq(dds, test="LRT", reduced = ~ ko_group + log_q75)
res_lrt = results(dds_lrt)
res_run_id = as.data.frame(res_lrt)
dim(res_run_id)
res_run_id[1:2,]
table(is.na(res_run_id$padj))
g0 = ggplot(res_run_id %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(model1, " Run ID"))
print(g0)
## DE analysis for each KO gene
table(sample_info$ko_gene)
genos = setdiff(unique(sample_info$ko_gene), "WT")
genos
ko_grp = c("CE", "KO", "PTC")
res_geno_df = NULL
for(geno in genos){
res_geno = list()
for(k_grp1 in ko_grp){
res_g1 = results(dds, contrast = c("ko_group",
paste0(geno, "_", k_grp1), "WT_WT"))
res_g1 = signif(data.frame(res_g1), 3)
res_geno[[k_grp1]] = res_g1
g1 = ggplot(res_g1 %>% subset(!is.na(padj)), aes(x=pvalue)) +
geom_histogram(color="darkblue", fill="lightblue",
breaks = seq(0,1,by=0.01)) +
ggtitle(paste0(geno, "_", k_grp1))
print(g1)
tag1 = sprintf("%s_%s_%s_", model1, geno, k_grp1)
colnames(res_g1) = paste0(tag1, colnames(res_g1))
if(is.null(res_geno_df)){
res_geno_df = res_g1
}else if(!is.null(res_geno_df)){
stopifnot(all(rownames(res_geno_df) == rownames(res_g1)))
res_geno_df = cbind(res_geno_df, res_g1)
}
}
get_index <- function(x){
which(x$padj < 0.05 & abs(x$log2FoldChange) > log2(1.5))
}
w_ce = get_index(res_geno[["CE"]])
w_ko = get_index(res_geno[["KO"]])
w_ptc = get_index(res_geno[["PTC"]])
m = make_comb_mat(list("CE" = rownames(res_geno[["CE"]])[w_ce],
"KO" = rownames(res_geno[["KO"]])[w_ko],
"PTC" = rownames(res_geno[["PTC"]])[w_ptc]))
g_up = UpSet(m)
print(g_up)
df1 = data.frame(padj_CE = res_geno[["CE"]]$padj,
padj_KO = res_geno[["KO"]]$padj,
padj_PTC = res_geno[["PTC"]]$padj)
cr1 = cor(df1$padj_PTC, df1$padj_CE, method = "spearman", use="pair")
cr2 = cor(df1$padj_PTC, df1$padj_KO, method = "spearman", use="pair")
g4 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_CE))) +
geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr1)) +
scale_color_viridis()
g5 = ggplot(data = df1, mapping = aes(x = -log10(padj_PTC),
y = -log10(padj_KO))) +
geom_pointdensity() + ggtitle(sprintf("%s, Spearman r = %.2f", geno, cr2)) +
scale_color_viridis()
print(g4)
print(g5)
}
dim(res_geno_df)
res_geno_df[1:2,]
res_df = data.frame(gene_ID=rownames(res_geno_df), res_geno_df)
dim(res_df)
res_df[1:2,]
fwrite(res_df, sprintf("results/2024_05_%s_DEseq2.tsv", model1), sep="\t")
}## [1] "ExM"
## [1] "PrS"
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 7724183 412.6 12436585 664.2 NA 12436585 664.2
## Vcells 33734052 257.4 95929172 731.9 65536 95929172 731.9
## R version 4.2.3 (2023-03-15)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] readxl_1.4.2 UpSetR_1.4.0
## [3] ComplexHeatmap_2.14.0 data.table_1.14.8
## [5] dplyr_1.1.2 ggpointdensity_0.1.0
## [7] viridis_0.6.2 viridisLite_0.4.1
## [9] DESeq2_1.38.3 SummarizedExperiment_1.28.0
## [11] Biobase_2.58.0 MatrixGenerics_1.10.0
## [13] matrixStats_1.0.0 GenomicRanges_1.50.2
## [15] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [17] S4Vectors_0.36.2 BiocGenerics_0.44.0
## [19] RColorBrewer_1.1-3 MASS_7.3-58.2
## [21] stringr_1.5.0 ggpubr_0.6.0
## [23] ggrepel_0.9.3 ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.1-0 ggsignif_0.6.4 rjson_0.2.21
## [4] circlize_0.4.15 XVector_0.38.0 GlobalOptions_0.1.2
## [7] clue_0.3-65 rstudioapi_0.14 farver_2.1.1
## [10] bit64_4.0.5 AnnotationDbi_1.60.2 fansi_1.0.4
## [13] codetools_0.2-19 doParallel_1.0.17 cachem_1.0.7
## [16] geneplotter_1.76.0 knitr_1.44 jsonlite_1.8.4
## [19] broom_1.0.4 annotate_1.76.0 cluster_2.1.4
## [22] png_0.1-8 compiler_4.2.3 httr_1.4.6
## [25] backports_1.4.1 Matrix_1.6-4 fastmap_1.1.1
## [28] cli_3.6.1 htmltools_0.5.5 tools_4.2.3
## [31] gtable_0.3.3 glue_1.6.2 GenomeInfoDbData_1.2.9
## [34] Rcpp_1.0.10 carData_3.0-5 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.6.2 Biostrings_2.66.0
## [40] iterators_1.0.14 xfun_0.39 lifecycle_1.0.3
## [43] rstatix_0.7.2 XML_3.99-0.14 zlibbioc_1.44.0
## [46] scales_1.2.1 parallel_4.2.3 yaml_2.3.7
## [49] memoise_2.0.1 gridExtra_2.3 sass_0.4.5
## [52] stringi_1.7.12 RSQLite_2.3.1 foreach_1.5.2
## [55] BiocParallel_1.32.6 shape_1.4.6 rlang_1.1.0
## [58] pkgconfig_2.0.3 bitops_1.0-7 evaluate_0.20
## [61] lattice_0.20-45 purrr_1.0.1 labeling_0.4.2
## [64] bit_4.0.5 tidyselect_1.2.0 plyr_1.8.8
## [67] magrittr_2.0.3 R6_2.5.1 generics_0.1.3
## [70] DelayedArray_0.24.0 DBI_1.1.3 pillar_1.9.0
## [73] withr_2.5.0 KEGGREST_1.38.0 abind_1.4-5
## [76] RCurl_1.98-1.12 tibble_3.2.1 crayon_1.5.2
## [79] car_3.1-2 utf8_1.2.3 rmarkdown_2.21
## [82] GetoptLong_1.0.5 locfit_1.5-9.8 blob_1.2.4
## [85] digest_0.6.31 xtable_1.8-4 tidyr_1.3.0
## [88] munsell_0.5.0 bslib_0.4.2